{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import jsonlines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_json(file_path):\n",
    "    \"\"\"Read a JSON Lines file and return its records as a list.\n",
    "\n",
    "    Every non-blank line of the file is parsed with ``json.loads``.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path of the JSON Lines file to read.\n",
    "\n",
    "    Returns:\n",
    "        A list holding one parsed object per non-blank line, in file order.\n",
    "\n",
    "    Raises:\n",
    "        OSError: If the file cannot be opened.\n",
    "        json.JSONDecodeError: If a non-blank line is not valid JSON.\n",
    "    \"\"\"\n",
    "    # Decode explicitly as UTF-8 so non-ASCII text loads the same way on every platform.\n",
    "    with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
    "        # Skip blank lines (e.g. a trailing newline), which json.loads would reject.\n",
    "        return [json.loads(line) for line in f if line.strip()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1319\n"
     ]
    }
   ],
   "source": [
    "# Input file: one JSON object per line.\n",
    "file_path = \"/<YOUR_OWN_PATH>/ToolQA/data/raw_data/gsm8k/gsm.chat.jsonl\"\n",
    "contents = read_json(file_path)\n",
    "\n",
    "# Report how many records were loaded.\n",
    "num_records = len(contents)\n",
    "print(num_records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['input', 'target', 'answer', 'score', 'generation'])\n",
      "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\n",
      "18\n",
      "18.0\n",
      "1\n",
      "['```\\ndef solution():\\n    \"\"\"Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers\\' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers\\' market?\"\"\"\\n    eggs_per_day = 16\\n    eggs_for_breakfast = 3\\n    eggs_for_muffins = 4\\n    eggs_for_sale = eggs_per_day - eggs_for_breakfast - eggs_for_muffins\\n    money_per_egg = 2\\n    money_made = eggs_for_sale * money_per_egg\\n    result = money_made\\n    return result\\n```']\n"
     ]
    }
   ],
   "source": [
    "# Inspect the first record: its keys first, then each field's value in turn.\n",
    "first_record = contents[0]\n",
    "print(first_record.keys())\n",
    "for field in (\"input\", \"target\", \"answer\", \"score\", \"generation\"):\n",
    "    print(first_record[field])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "264\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Tabulate the loaded records so they can be filtered by column.\n",
    "data = pd.DataFrame(contents)\n",
    "\n",
    "# Keep only the rows whose score equals 0, then report how many there are.\n",
    "is_zero_score = data[\"score\"] == 0\n",
    "sub_data = data.loc[is_zero_score]\n",
    "print(len(sub_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "# Fixed seed so that Restart & Run All selects the same questions every time.\n",
    "SAMPLE_SEED = 0\n",
    "# Draw at most 100 questions; capping by the rows available keeps random.sample\n",
    "# from raising ValueError when sub_data has fewer than 100 rows.\n",
    "SAMPLE_SIZE = min(100, len(sub_data))\n",
    "\n",
    "# A private Random instance keeps the seeding from touching the global random state.\n",
    "rng = random.Random(SAMPLE_SEED)\n",
    "random_indices = rng.sample(range(len(sub_data)), SAMPLE_SIZE)\n",
    "\n",
    "# Build one record per sampled row; qids are numbered in sampling order.\n",
    "questions = []\n",
    "for question_id, i in enumerate(random_indices):\n",
    "    sampled = sub_data.iloc[i]  # positional lookup, done once per row\n",
    "    questions.append({\n",
    "        \"qid\": \"easy-gsm8k-{:0>4d}\".format(question_id),\n",
    "        \"question\": sampled[\"input\"],\n",
    "        \"answer\": float(sampled[\"target\"]),\n",
    "    })\n",
    "\n",
    "# Write the sampled questions out as JSON Lines, one record per line.\n",
    "with jsonlines.open(\"/<YOUR_OWN_PATH>/ToolQA/data/questions/easy/gsm8k-easy.jsonl\", mode='w') as writer:\n",
    "    for row in questions:\n",
    "        writer.write(row)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
